setwd("~/Desktop/Coursework_files")
getwd()
## [1] "/Users/sangbin/Desktop/Coursework_files"
library(vioplot)
## Loading required package: sm
## Package 'sm', version 2.2-5.7: type help(sm) for summary information
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(tidyr)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggstatsplot)
## You can cite this package as:
## Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
## Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggpubr)
library(caret)
## Loading required package: lattice
library(Metrics)
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
library(superml)
## Loading required package: R6
library(caTools)
library(ranger)
library(rpart)
library(ROSE)
## Loaded ROSE 0.0-4
airports <- read.csv("./dataverse_files/airports.csv", sep=",", header = TRUE)
carriers <- read.csv("./dataverse_files/carriers.csv", sep=",", header = TRUE)
planes <- read.csv("./dataverse_files/plane-data.csv", sep=",", header = TRUE)
variable_descriptions <- read.csv("/Users/sangbin/Desktop/Coursework_files/dataverse_files/variable-descriptions.csv",sep=",", header = TRUE)
df_03 <- read.csv("./dataverse_files/2003.csv.bz2", sep=",", header = TRUE)
df_04 <- read.csv("./dataverse_files/2004.csv.bz2", sep=",", header = TRUE)
df_05 <- read.csv("./dataverse_files/2005.csv.bz2", sep=",", header = TRUE)
df <- rbind(df_03, df_04, df_05)
df$Date <- make_date(df$Year,df$Month,df$DayofMonth)
df_copy <- df
df_copy <- subset(df_copy, Cancelled == 0 & Diverted == 0)
df_copy <- subset(df_copy, select=-c(CancellationCode))
df_copy$DelayedArr <- 0
df_copy$DelayedDep <- 0
df_copy$DelayedArr[df_copy$ArrDelay > 15] <- 1
df_copy$DelayedDep[df_copy$DepDelay > 15] <- 1
df_copy$CarrierDelay <- df_copy$CarrierDelay %>% replace_na(0)
df_copy$WeatherDelay <- df_copy$WeatherDelay %>% replace_na(0)
df_copy$NASDelay <- df_copy$NASDelay %>% replace_na(0)
df_copy$SecurityDelay <- df_copy$SecurityDelay %>% replace_na(0)
df_copy$LateAircraftDelay <- df_copy$LateAircraftDelay %>% replace_na(0)
summary(df_copy)
## Year Month DayofMonth DayOfWeek DepTime
## Min. :2003 Min. : 1.000 Min. : 1.00 Min. :1.000 Min. : 1
## 1st Qu.:2003 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.:2.000 1st Qu.: 936
## Median :2004 Median : 7.000 Median :16.00 Median :4.000 Median :1330
## Mean :2004 Mean : 6.524 Mean :15.73 Mean :3.942 Mean :1345
## 3rd Qu.:2005 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:6.000 3rd Qu.:1733
## Max. :2005 Max. :12.000 Max. :31.00 Max. :7.000 Max. :2805
##
## CRSDepTime ArrTime CRSArrTime UniqueCarrier FlightNum
## Min. : 0 Min. : 1 Min. : 0 Length:20356257 Min. : 1
## 1st Qu.: 935 1st Qu.:1120 1st Qu.:1125 Class :character 1st Qu.: 584
## Median :1327 Median :1522 Median :1525 Mode :character Median :1415
## Mean :1339 Mean :1497 Mean :1504 Mean :2038
## 3rd Qu.:1725 3rd Qu.:1915 3rd Qu.:1910 3rd Qu.:2928
## Max. :2400 Max. :2955 Max. :2400 Max. :9912
##
## TailNum ActualElapsedTime CRSElapsedTime AirTime
## Length:20356257 Min. :-710.0 Min. : -32 Min. :-3818.0
## Class :character 1st Qu.: 72.0 1st Qu.: 75 1st Qu.: 53.0
## Mode :character Median : 104.0 Median : 105 Median : 84.0
## Mean : 123.5 Mean : 125 Mean : 102.6
## 3rd Qu.: 154.0 3rd Qu.: 155 3rd Qu.: 134.0
## Max. :1849.0 Max. :1441 Max. : 3508.0
## NA's :1
## ArrDelay DepDelay Origin Dest
## Min. :-1302.000 Min. :-1410.000 Length:20356257 Length:20356257
## 1st Qu.: -10.000 1st Qu.: -4.000 Class :character Class :character
## Median : -2.000 Median : 0.000 Mode :character Mode :character
## Mean : 5.827 Mean : 7.304
## 3rd Qu.: 10.000 3rd Qu.: 5.000
## Max. : 1925.000 Max. : 1930.000
## NA's :1
## Distance TaxiIn TaxiOut Cancelled Diverted
## Min. : 8 Min. : 0.000 Min. : 0.00 Min. :0 Min. :0
## 1st Qu.: 308 1st Qu.: 4.000 1st Qu.: 10.00 1st Qu.:0 1st Qu.:0
## Median : 551 Median : 5.000 Median : 13.00 Median :0 Median :0
## Mean : 719 Mean : 7.561 Mean : 15.69 Mean :0 Mean :0
## 3rd Qu.: 948 3rd Qu.: 7.000 3rd Qu.: 18.00 3rd Qu.:0 3rd Qu.:0
## Max. :4962 Max. :1523.000 Max. :3905.00 Max. :0 Max. :0
##
## CarrierDelay WeatherDelay NASDelay SecurityDelay
## Min. : 0.000 Min. : 0.0000 Min. : -60.000 Min. : 0.0000
## 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 0.000 Median : 0.0000 Median : 0.000 Median : 0.0000
## Mean : 2.399 Mean : 0.5766 Mean : 2.966 Mean : 0.0197
## 3rd Qu.: 0.000 3rd Qu.: 0.0000 3rd Qu.: 0.000 3rd Qu.: 0.0000
## Max. :1925.000 Max. :1510.0000 Max. :1385.000 Max. :533.0000
##
## LateAircraftDelay Date DelayedArr DelayedDep
## Min. : 0.000 Min. :2003-01-01 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0.000 1st Qu.:2003-10-19 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 0.000 Median :2004-07-19 Median :0.0000 Median :0.0000
## Mean : 2.985 Mean :2004-07-12 Mean :0.1848 Mean :0.1517
## 3rd Qu.: 0.000 3rd Qu.:2005-04-11 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1407.000 Max. :2005-12-31 Max. :1.0000 Max. :1.0000
##
df_copy <- df_copy %>% drop_na(ArrDelay)
df_copy$DepHour = df_copy$DepTime %/% 100L
df_copy$ArrHour = df_copy$ArrTime %/% 100L
df_copy$CRSDepHour = df_copy$CRSDepTime %/% 100L
df_copy$CRSArrHour = df_copy$CRSArrTime %/% 100L
df_copy$DepHour <- as.character(df_copy$DepHour)
df_copy$ArrHour <- as.character(df_copy$ArrHour)
df_copy$CRSDepHour <- as.character(df_copy$CRSDepHour)
df_copy$CRSArrHour <- as.character(df_copy$CRSArrHour)
df_copy$DepHour = with(df_copy, ifelse(DepHour %in% c("24","0") , "00",
ifelse(DepHour %in% c("25","1") , "01",
ifelse(DepHour %in% c("26","2") , "02",
ifelse(DepHour %in% c("27","3") , "03",
ifelse(DepHour %in% c("28","4") , "04",
ifelse(DepHour %in% c("29","5") , "05",
ifelse(DepHour %in% c("6") , "06",
ifelse(DepHour %in% c("7") , "07",
ifelse(DepHour %in% c("8") , "08",
ifelse(DepHour %in% c("9") , "09",
DepHour)))))))))))
df_copy$ArrHour = with(df_copy, ifelse(ArrHour %in% c("24","0") , "00",
ifelse(ArrHour %in% c("25","1") , "01",
ifelse(ArrHour %in% c("26","2") , "02",
ifelse(ArrHour %in% c("27","3") , "03",
ifelse(ArrHour %in% c("28","4") , "04",
ifelse(ArrHour %in% c("29","5") , "05",
ifelse(ArrHour %in% c("6") , "06",
ifelse(ArrHour %in% c("7") , "07",
ifelse(ArrHour %in% c("8") , "08",
ifelse(ArrHour %in% c("9") , "09",
ArrHour)))))))))))
df_copy$CRSDepHour = with(df_copy, ifelse(CRSDepHour %in% c("24","0") , "00",
ifelse(CRSDepHour %in% c("25","1") , "01",
ifelse(CRSDepHour %in% c("26","2") , "02",
ifelse(CRSDepHour %in% c("27","3") , "03",
ifelse(CRSDepHour %in% c("28","4") , "04",
ifelse(CRSDepHour %in% c("29","5") , "05",
ifelse(CRSDepHour %in% c("6") , "06",
ifelse(CRSDepHour %in% c("7") , "07",
ifelse(CRSDepHour %in% c("8") , "08",
ifelse(CRSDepHour %in% c("9") , "09",
CRSDepHour)))))))))))
df_copy$CRSArrHour = with(df_copy, ifelse(CRSArrHour %in% c("24","0") , "00",
ifelse(CRSArrHour %in% c("25","1") , "01",
ifelse(CRSArrHour %in% c("26","2") , "02",
ifelse(CRSArrHour %in% c("27","3") , "03",
ifelse(CRSArrHour %in% c("28","4") , "04",
ifelse(CRSArrHour %in% c("29","5") , "05",
ifelse(CRSArrHour %in% c("6") , "06",
ifelse(CRSArrHour %in% c("7") , "07",
ifelse(CRSArrHour %in% c("8") , "08",
ifelse(CRSArrHour %in% c("9") , "09",
CRSArrHour)))))))))))
Distance_delay <- df_copy %>%
group_by(Distance) %>%
summarise(Average_ArrDelay = mean(ArrDelay))
a<- ggplot(data = Distance_delay , mapping = aes(x = Distance, y = Average_ArrDelay)) +
geom_line(stat='identity') +
geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
labs(title = "Distribution of arrival delay based on distance",x = "Distance", y ="Average ArrDelay (min)") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))
b<-ggplot(data = Distance_delay ,aes(x = Distance)) +
geom_histogram(aes(y=..density..),
binwidth=100) +
labs(title = "Frequency of the flight over distance",x = "Distance", y ="Frequency of the flight") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))+
geom_density(alpha=.2, fill="white")
ggarrange(a,b,
labels = c("A", "B"),
ncol = 2, nrow = 1)
We can observe that shorter distanced flights seem to have more fluctuation on delays compared to long distance flights.
We can also observe that there are more of shorter distanced flights compared to long distance flights.
Delayed_flights <- filter(df_copy,df_copy$DelayedArr == 1)
c <- ggplot(data=Delayed_flights, aes(y=ArrDelay))+
geom_boxplot(fill='slategrey',color='darkslategrey',width=0.3)+
labs(title = "ArrDelay Boxplot(before)") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))
d <- ggplot(data=Delayed_flights, aes(y=DepDelay))+
geom_boxplot(fill='slategrey',color='darkslategrey',width=0.3)+
labs(title = "DepDelay Boxplot(before)") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))
ggarrange(c,d,
labels = c("C", "D"),
ncol = 2, nrow = 1)
Q_ArrDelay <- quantile(Delayed_flights$ArrDelay, probs=c(.25, .75), na.rm = FALSE)
IQR_ArrDelay <- IQR(Delayed_flights$ArrDelay)
Upper_Whisker_ArrDelay <- Q_ArrDelay[2]+1.5*IQR_ArrDelay
Lower_Whisker_ArrDelay <- Q_ArrDelay[1]-1.5*IQR_ArrDelay
Q_DepDelay <- quantile(Delayed_flights$DepDelay, probs=c(.25, .75), na.rm = FALSE)
IQR_DepDelay <- IQR(Delayed_flights$DepDelay)
Upper_Whisker_DepDelay <- Q_DepDelay[2]+1.5*IQR_DepDelay
Lower_Whisker_DepDelay <- Q_DepDelay[1]-1.5*IQR_DepDelay
Delayed_flights<- subset(Delayed_flights, Delayed_flights$ArrDelay > Lower_Whisker_ArrDelay &
Delayed_flights$ArrDelay < Upper_Whisker_ArrDelay &
Delayed_flights$DepDelay > Lower_Whisker_DepDelay &
Delayed_flights$DepDelay < Upper_Whisker_DepDelay)
e <- ggplot(data=Delayed_flights, aes(y=ArrDelay))+
geom_boxplot(fill='slategrey',color='darkslategrey',width=0.3)+
labs(title = "ArrDelay Boxplot(after)") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))
f <- ggplot(data=Delayed_flights, aes(y=DepDelay))+
geom_boxplot(fill='slategrey',color='darkslategrey',width=0.3)+
labs(title = "DepDelay Boxplot(after)") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))
ggarrange(e,f,
labels = c("E", "F"),
ncol = 2, nrow = 1)
total_flights_per_DepHour <- df_copy %>%
group_by(DepHour) %>%
summarise(Count = sum(DelayedArr))
g <- ggplot(data = total_flights_per_DepHour, mapping = aes(x = reorder(DepHour,-Count), y = Count)) +
geom_bar(stat='identity',fill="steelblue") +
geom_text(mapping = aes(label = round(Count,0), fontface = 'bold', vjust = -0.2), size = 2) +
labs(title = "Total number of flights per hour",x = "Actual Departure Hour", y ="Number of flights") +
theme(plot.title = element_text(hjust = 0.5))
delayed_flights_per_DepHour <- Delayed_flights %>%
group_by(DepHour) %>%
summarise(Count = sum(DelayedArr))
h <- ggplot(data = delayed_flights_per_DepHour, mapping = aes(x = reorder(DepHour,-Count), y = Count)) +
geom_bar(stat='identity',fill="steelblue") +
geom_text(mapping = aes(label = round(Count,0), fontface = 'bold', vjust = -0.2), size = 2) +
labs(title = "Number of delayed flights per hour",x = "Actual Departure Hour", y ="Number of delayed flights") +
theme(plot.title = element_text(hjust = 0.5))
ggarrange(g,h,
labels = c("G", "H"),
ncol = 2, nrow = 1)
Delayed_flights_DepHour <- Delayed_flights %>%
group_by(DepHour) %>%
summarise(Average_DepDelay = mean(DepDelay),Average_ArrDelay = mean(ArrDelay))
i <- ggplot(data = Delayed_flights_DepHour, mapping = aes(x = DepHour, y = Average_DepDelay)) +
geom_bar(stat='identity',fill="steelblue") +
geom_text(mapping = aes(label = round(Average_DepDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
labs(title = "Distribution of delay over the hours day",x = "Actual Departure Hour", y ="Average DepDelay (min)") +
theme(plot.title = element_text(hjust = 0.5))
j <- ggplot(data = Delayed_flights_DepHour, mapping = aes(x = DepHour, y = Average_ArrDelay)) +
geom_bar(stat='identity',fill="steelblue") +
geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
labs(title = "Distribution of delay over the hours day",x = "Actual Departure Hour", y ="Average ArrDelay (min)") +
theme(plot.title = element_text(hjust = 0.5))
Delayed_flights_CRSDepHour <- Delayed_flights %>%
group_by(CRSDepHour) %>%
summarise(Average_DepDelay = mean(DepDelay),Average_ArrDelay = mean(ArrDelay))
k <- ggplot(data = Delayed_flights_CRSDepHour, mapping = aes(x = CRSDepHour, y = Average_DepDelay)) +
geom_bar(stat='identity',fill="steelblue") +
geom_text(mapping = aes(label = round(Average_DepDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
labs(title = "Distribution of delay over the hours day",x = "CRS Departure Hour", y ="Average DepDelay (min)") +
theme(plot.title = element_text(hjust = 0.5))
l <- ggplot(data = Delayed_flights_CRSDepHour, mapping = aes(x = CRSDepHour, y = Average_ArrDelay)) +
geom_bar(stat='identity',fill="steelblue") +
geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
labs(title = "Distribution of delay over the hours day",x = "CRS Departure Hour", y ="Average ArrDelay (min)") +
theme(plot.title = element_text(hjust = 0.5))
ggarrange(i,j,k,l,
labels = c("I", "J","K","L"),
ncol = 2, nrow = 2)
Delayed_flights_Day <- Delayed_flights %>%
group_by(DayOfWeek) %>%
summarise(Count = sum(DelayedArr),Average_DepDelay = mean(DepDelay),Average_ArrDelay = mean(ArrDelay))
m <-ggplot(data = Delayed_flights_Day, mapping = aes(x = reorder(wday(DayOfWeek,label = TRUE),-Average_ArrDelay), y = Average_ArrDelay)) +
geom_bar(stat='identity',fill="steelblue") +
geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
labs(title = "Average Arrival Delay by Day",x = "Day", y ="Average ArrDelay (min)") +
theme(plot.title = element_text(hjust = 0.5))
n <- ggplot(data = Delayed_flights_Day, mapping = aes(x = reorder(wday(DayOfWeek,label = TRUE),-Count), y =Count)) +
geom_bar(stat='identity',fill="steelblue") +
geom_text(mapping = aes(label = round(Count,0), fontface = 'bold', vjust = -0.2), size = 2) +
labs(title = "Number of Arrival Delay by Day",x = "Day", y ="Arrival Delay Count") +
theme(plot.title = element_text(hjust = 0.5))
ggarrange(m,n,
labels = c("M", "N"),
ncol = 2, nrow = 1)
Delayed_flights_Month <- Delayed_flights %>%
group_by(Month) %>%
summarise(Count= sum(DelayedArr),Average_DepDelay = mean(DepDelay),Average_ArrDelay = mean(ArrDelay))
o <- ggplot(data = Delayed_flights_Month, mapping = aes(x = reorder(month(Month,label = TRUE),-Average_ArrDelay), y = Average_ArrDelay)) +
geom_bar(stat='identity',fill="steelblue") +
geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
labs(title = "Average Arrival Delay by Month",x = "Month", y ="Average ArrDelay (min)")+
theme(plot.title = element_text(hjust = 0.5))
p <- ggplot(data = Delayed_flights_Month, mapping = aes(x = reorder(month(Month,label = TRUE),-Count), y =Count)) +
geom_bar(stat='identity',fill="steelblue") +
geom_text(mapping = aes(label = round(Count,0), fontface = 'bold', vjust = -0.2), size = 2) +
labs(title = "Number of Arrival Delay by Day",x = "Month", y ="Arrival Delay Count") +
theme(plot.title = element_text(hjust = 0.5))
ggarrange(o,p,
labels = c("O", "P"),
ncol = 2, nrow = 1)
*From the plots, we can identify the best time to minimize delays.
plane_characteristic <- Delayed_flights
plane_characteristic <- left_join(plane_characteristic, planes, by = c("TailNum"='tailnum'))%>%
group_by(year) %>%
summarise(Average_DepDelay = mean(DepDelay),Average_ArrDelay = mean(ArrDelay), Average_CarrierDelay = mean(CarrierDelay))
plane_characteristic <- plane_characteristic %>%
filter(!is.na(year), plane_characteristic$year !="",plane_characteristic$year !="0000",plane_characteristic$year !="None",plane_characteristic$year !="2007")
q<- ggplot(data = plane_characteristic, mapping = aes(x = year, y = Average_ArrDelay)) +
geom_bar(stat='identity',fill="steelblue") +
geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
labs(title = "Distribution of delay over the engine year of the plane",x = "Engine Year", y ="Average ArrDelay (min)") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))
r<- ggplot(data = plane_characteristic, mapping = aes(x = year, y = Average_DepDelay)) +
geom_bar(stat='identity',fill="steelblue") +
geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
labs(title = "Distribution of delay over the engine year of the plane",x = "Engine Year", y ="Average DepDelay (min)") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))
s<- ggplot(data = plane_characteristic, mapping = aes(x = year, y = Average_CarrierDelay)) +
geom_bar(stat='identity',fill="steelblue") +
geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
labs(title = "Distribution of delay over the engine year of the plane",x = "Engine Year", y ="Average CarrierDelay (min)") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))
ggarrange(q,r,s,
labels = c("Q", "R", "S"),
ncol = 2, nrow = 2)
Destination <- df_copy
Destination$season <- month(Destination$Date,label = TRUE)
Destination$trip <-paste(Destination$Origin,Destination$Dest,sep="-")
Destination$season = with(Destination, ifelse(season%in% c("Dec","Jan","Feb") , "Winter",
ifelse(season %in% c("Mar","Apr","May") , "Spring",
ifelse(season %in% c("Jun","Jul","Aug") , "Summer",
ifelse(season %in% c("Sep","Oct","Nov") , "Autumn",
season)))))
Destination <- left_join(Destination, airports, by = c("Dest"='iata'))
To identify the number of people flying between different locations over time, plot most visited trips, airport, city, and state.
Top 10 Most Frequent Trips
Destination_trip <- Destination %>%
group_by(season,trip)%>%
summarise(count = n())
## `summarise()` has grouped output by 'season'. You can override using the
## `.groups` argument.
Destination_trip %>%
arrange(desc(count)) %>%
slice(1:10) %>%
ggplot(., mapping = aes(x = reorder(trip, -count), y = count)) +
facet_wrap(~season)+
geom_bar(stat='identity',fill="steelblue") +
geom_text(mapping = aes(label = round(count,0), fontface = 'bold', vjust = -0.2), size = 1.5) +
labs(title = "Top 10 Most Frequent Trips",x = "Trip", y ="Total Trip Count") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))
Destination_airport <- Destination %>%
group_by(season,airport)%>%
summarise(count = n())
## `summarise()` has grouped output by 'season'. You can override using the
## `.groups` argument.
Destination_airport %>%
arrange(desc(count)) %>%
slice(1:10) %>%
ggplot(., mapping = aes(x = reorder(airport, -count), y = count)) +
facet_wrap(~season)+
geom_bar(stat='identity',fill="steelblue") +
geom_text(mapping = aes(label = round(count,0), fontface = 'bold', vjust = -0.2), size = 1.5) +
labs(title = "Top 10 Most Frequent airports",x = "airport", y ="Total airport Count") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))
Destination_city <- Destination %>%
group_by(season,city)%>%
summarise(count = n())
## `summarise()` has grouped output by 'season'. You can override using the
## `.groups` argument.
Destination_city %>%
arrange(desc(count)) %>%
slice(1:10) %>%
ggplot(., mapping = aes(x = reorder(city, -count), y = count)) +
facet_wrap(~season)+
geom_bar(stat='identity',fill="steelblue") +
geom_text(mapping = aes(label = round(count,0), fontface = 'bold', hjust = .5,vjust = 2), size = 2) +
labs(title = "Top 10 Most Frequent citys",x = "city", y ="Total city Count") +
theme(plot.title = element_text(hjust = 0.5), # title
axis.text.x=element_text(angle=90, hjust=1, size = 8), # x axis
axis.text.y=element_text(hjust=1, size = 8), # y axis
strip.text = element_text(size = 8))#facet text
Destination_state <- Destination %>%
group_by(season,state)%>%
summarise(count = n())
## `summarise()` has grouped output by 'season'. You can override using the
## `.groups` argument.
Destination_state %>%
arrange(desc(count)) %>%
slice(1:10) %>%
ggplot(., mapping = aes(x = reorder(state, -count), y = count)) +
facet_wrap(~season)+
geom_bar(stat='identity',fill="steelblue") +
geom_text(mapping = aes(label = round(count,0), fontface = 'bold', hjust = .5,vjust = 2), size = 2) +
labs(title = "Top 10 Most Frequent citys",x = "city", y ="Total city Count") +
theme(plot.title = element_text(hjust = 0.5), # title
axis.text.x=element_text(angle=90, hjust=1, size = 8), # x axis
axis.text.y=element_text(hjust=1, size = 8), # y axis
strip.text = element_text(size = 8))#facet text
Airport <- filter(Destination,Destination$LateAircraftDelay > 0)
Airport <- subset(Airport, Airport$DelayedArr == 1 & Airport$DelayedDep == 1)
Airport_cascading_failure <- subset(Airport, Airport$TailNum %in% c("N336","N337"))
no_cascading_failure <- Airport_cascading_failure %>%
filter(Date =="2004-04-12" & FlightNum == 2967)%>%
arrange(CRSDepHour)
t <- ggplot(data = no_cascading_failure, mapping = aes(x = CRSDepHour, y = LateAircraftDelay)) +
geom_line(group=1) +
geom_text(mapping = aes(label = LateAircraftDelay, fontface = 'bold', vjust = -0.2), size = 2) +
labs(title = "LateAircraftDelay on 2004-04-12 for tail N336",x = "CRSDepHour", y ="LateAircraftDelay (min)") +
theme(plot.title = element_text(hjust = 0.5))
cascading_failure <- Airport_cascading_failure %>%
filter(Date =="2005-06-10" & TailNum == "N337")%>%
arrange(CRSDepHour)
u <- ggplot(data = cascading_failure, mapping = aes(x = CRSDepHour, y = LateAircraftDelay)) +
geom_line(group=1) +
geom_text(mapping = aes(label = LateAircraftDelay, fontface = 'bold', vjust = -0.2), size = 2) +
labs(title = "LateAircraftDelay on 2005-06-10 for tail N337",x = "CRSDepHour", y ="LateAircraftDelay (min)") +
theme(plot.title = element_text(hjust = 0.5))
ggarrange(t,u,
labels = c("T", "U"),
ncol = 2, nrow = 1)
Longest_carrier_delay_flights <- Airport %>%
group_by(airport) %>%
summarise(Average_DepDelay = mean(DepDelay),Average_ArrDelay = mean(ArrDelay), Average_CarrierDelay = mean(CarrierDelay))
v <- Longest_carrier_delay_flights %>%
arrange(desc(Average_DepDelay)) %>%
slice(1:10) %>%
ggplot(., mapping = aes(x = reorder(airport, -Average_DepDelay), y =Average_DepDelay)) +
geom_bar(stat='identity',fill="steelblue")+
geom_text(mapping = aes(label = round(Average_DepDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
labs(title = "Top 10 airports with longest departure delay",x = "Airport", y ="Average DepDelay (min)") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))
w <- Longest_carrier_delay_flights %>%
arrange(desc(Average_ArrDelay)) %>%
slice(1:10) %>%
ggplot(., mapping = aes(x = reorder(airport, -Average_ArrDelay), y =Average_ArrDelay)) +
geom_bar(stat='identity',fill="steelblue")+
geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
labs(title = "Top 10 airports with longest arrival delay",x = "Airport", y ="Average ArrDelay (min)") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))
x <- Longest_carrier_delay_flights %>%
arrange(desc(Average_CarrierDelay)) %>%
slice(1:10) %>%
ggplot(., mapping = aes(x = reorder(airport, -Average_CarrierDelay), y =Average_CarrierDelay)) +
geom_bar(stat='identity',fill="steelblue")+
geom_text(mapping = aes(label = round(Average_CarrierDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
labs(title = "Top 10 airports with longest carrier delay",x = "Airport", y ="Average CarrierDelay (min)") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))
ggarrange(v,w,x,
labels = c("V", "W", "X"),
ncol = 2, nrow = 2)
Airline <- Delayed_flights
Airline <- left_join(Airline, carriers, by = c("UniqueCarrier"='Code'))%>%
group_by(UniqueCarrier) %>%
summarise(Average_DepDelay = mean(DepDelay),Average_ArrDelay = mean(ArrDelay), Average_CarrierDelay = mean(CarrierDelay))
y <- ggplot(data = Airline, mapping = aes(x = UniqueCarrier, y = Average_ArrDelay)) +
geom_bar(stat='identity',fill="steelblue") +
geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
labs(title = "Distribution of arrival delay over Airline",x = "Airline Code", y ="Average ArrDelay (min)") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))
z <- ggplot(data = Airline, mapping = aes(x = UniqueCarrier, y = Average_DepDelay)) +
geom_bar(stat='identity',fill="steelblue") +
geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
labs(title = "Distribution of departure delay over Airline",x = "Airline Code", y ="Average DepDelay (min)") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))
aa<- ggplot(data = Airline, mapping = aes(x = UniqueCarrier, y = Average_CarrierDelay)) +
geom_bar(stat='identity',fill="steelblue") +
geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
labs(title = "Distribution of carrier delay over Airline",x = "Airline Code", y ="Average CarrierDelay (min)") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))
ggarrange(y,z,aa,
labels = c("Y", "Z", "AA"),
ncol = 2, nrow = 2)
prediction <- df_copy %>% filter(row_number() %% 1000 == 0) %>%
dplyr::select(c(DelayedArr,CRSDepHour,Origin,UniqueCarrier,Month,Distance))
prediction[prediction$DelayedArr == 0,]$DelayedArr <- "on-time"
prediction[prediction$DelayedArr == 1,]$DelayedArr <- "delayed"
prediction$DelayedArr <- as.factor(prediction$DelayedArr)
lbl <- LabelEncoder$new()
prediction$CRSDepHour <- lbl$fit_transform(prediction$CRSDepHour)
prediction$Origin <- lbl$fit_transform(prediction$Origin)
prediction$UniqueCarrier <- lbl$fit_transform(prediction$UniqueCarrier)
set.seed(1)
training_sample <- createDataPartition(prediction$DelayedArr,p = 0.8, list = FALSE)
dataset<- prediction[training_sample,]
validation <- prediction[-training_sample,]
Since our aim is to predict delayed flights, pick “DelayedArr” as independent variable(label). Pick dependent variables(features) that are available in the future flights and assumed to be influential to delays.
split input and output
output <- dataset[,1]
input <- dataset[,2:6]
par(mfrow=c(1,5))
for(i in 2:6) {vioplot(prediction[,i], main=names(input)[i-1])}
control <- trainControl(method="cv", number=10)
metric <- "Accuracy"
table(dataset$DelayedArr)
##
## delayed on-time
## 3041 13244
prop.table(table(dataset$DelayedArr))
##
## delayed on-time
## 0.1867363 0.8132637
par(mfrow=c(1, 1))
barplot(prop.table(table(dataset$DelayedArr)),
ylim = c(0, 0.9),
main = "Class Distribution")
set.seed(1)
undersampling <- ovun.sample(DelayedArr~., data=dataset, method = "under",N = 6082)$data
table(undersampling$DelayedArr)
##
## on-time delayed
## 3041 3041
set.seed(1)
oversampling <- ovun.sample(DelayedArr~., data=dataset, method = "over",N = 26488)$data
table(oversampling$DelayedArr)
##
## on-time delayed
## 13244 13244
set.seed(1)
bothsampling <- ovun.sample(DelayedArr~., data=dataset, method = "both",N = 16285, p=.5)$data
table(bothsampling$DelayedArr)
##
## on-time delayed
## 8189 8096
tree.over <- rpart(DelayedArr~., data=oversampling)
tree.under <- rpart(DelayedArr~., data=undersampling)
tree.both <- rpart(DelayedArr~., data=bothsampling)
pred.tree.over <- predict(tree.over, newdata = dataset)
pred.tree.under <- predict(tree.under, newdata = dataset)
pred.tree.both <- predict(tree.both, newdata = dataset)
roc.curve(dataset$DelayedArr, pred.tree.under[,2])
## Area under the curve (AUC): 0.580
roc.curve(dataset$DelayedArr, pred.tree.over[,2])
## Area under the curve (AUC): 0.550
roc.curve(dataset$DelayedArr, pred.tree.both[,2])
## Area under the curve (AUC): 0.579
set.seed(1)
fit.glm_undersampling <- train(DelayedArr~., data=undersampling, method="glm", metric=metric, trControl=control, family="binomial")
set.seed(1)
fit.glm_oversampling <- train(DelayedArr~., data=oversampling, method="glm", metric=metric, trControl=control, family="binomial")
set.seed(1)
fit.glm_bothsampling <- train(DelayedArr~., data=bothsampling, method="glm", metric=metric, trControl=control, family="binomial")
predictions_glm_undersampling <- predict(fit.glm_undersampling, validation)
predictions_glm_oversampling <- predict(fit.glm_oversampling, validation)
predictions_glm_bothsampling <- predict(fit.glm_bothsampling, validation)
confusionMatrix(predictions_glm_undersampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_glm_undersampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction delayed on-time
## delayed 430 1545
## on-time 330 1766
##
## Accuracy : 0.5394
## 95% CI : (0.524, 0.5548)
## No Information Rate : 0.8133
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0614
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5658
## Specificity : 0.5334
## Pos Pred Value : 0.2177
## Neg Pred Value : 0.8426
## Prevalence : 0.1867
## Detection Rate : 0.1056
## Detection Prevalence : 0.4851
## Balanced Accuracy : 0.5496
##
## 'Positive' Class : delayed
##
confusionMatrix(predictions_glm_oversampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_glm_oversampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction delayed on-time
## delayed 429 1549
## on-time 331 1762
##
## Accuracy : 0.5382
## 95% CI : (0.5227, 0.5536)
## No Information Rate : 0.8133
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0598
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5645
## Specificity : 0.5322
## Pos Pred Value : 0.2169
## Neg Pred Value : 0.8419
## Prevalence : 0.1867
## Detection Rate : 0.1054
## Detection Prevalence : 0.4859
## Balanced Accuracy : 0.5483
##
## 'Positive' Class : delayed
##
confusionMatrix(predictions_glm_bothsampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_glm_bothsampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction delayed on-time
## delayed 404 1483
## on-time 356 1828
##
## Accuracy : 0.5483
## 95% CI : (0.5328, 0.5636)
## No Information Rate : 0.8133
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0533
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.53158
## Specificity : 0.55210
## Pos Pred Value : 0.21410
## Neg Pred Value : 0.83700
## Prevalence : 0.18669
## Detection Rate : 0.09924
## Detection Prevalence : 0.46352
## Balanced Accuracy : 0.54184
##
## 'Positive' Class : delayed
##
results <- resamples(list(under=fit.glm_undersampling, over=fit.glm_oversampling, both=fit.glm_bothsampling))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: under, over, both
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## under 0.5180921 0.5242599 0.5382124 0.5391318 0.5449690 0.5789474 0
## over 0.5311438 0.5405814 0.5502927 0.5475695 0.5545695 0.5590789 0
## both 0.5224064 0.5264128 0.5296283 0.5357062 0.5475057 0.5518723 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## under 0.03618421 0.04851974 0.07644376 0.07827149 0.08996921 0.1578947 0
## over 0.06226774 0.08116872 0.10057775 0.09513666 0.10913897 0.1181582 0
## both 0.04466697 0.05264292 0.05907156 0.07131276 0.09503272 0.1035835 0
set.seed(1)
fit.rf_undersampling <- train(DelayedArr~., data=undersampling, method="rf", metric=metric, trControl=control)
set.seed(1)
fit.rf_oversampling <- train(DelayedArr~., data=oversampling, method="rf", metric=metric, trControl=control)
set.seed(1)
fit.rf_bothsampling <- train(DelayedArr~., data=bothsampling, method="rf", metric=metric, trControl=control)
predictions_rf_undersampling <- predict(fit.rf_undersampling, validation)
predictions_rf_oversampling <- predict(fit.rf_oversampling, validation)
predictions_rf_bothsampling <- predict(fit.rf_bothsampling, validation)
confusionMatrix(predictions_rf_undersampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_rf_undersampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction delayed on-time
## delayed 422 1510
## on-time 338 1801
##
## Accuracy : 0.5461
## 95% CI : (0.5306, 0.5614)
## No Information Rate : 0.8133
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0622
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5553
## Specificity : 0.5439
## Pos Pred Value : 0.2184
## Neg Pred Value : 0.8420
## Prevalence : 0.1867
## Detection Rate : 0.1037
## Detection Prevalence : 0.4746
## Balanced Accuracy : 0.5496
##
## 'Positive' Class : delayed
##
confusionMatrix(predictions_rf_oversampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_rf_oversampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction delayed on-time
## delayed 130 360
## on-time 630 2951
##
## Accuracy : 0.7568
## 95% CI : (0.7433, 0.7699)
## No Information Rate : 0.8133
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0722
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.17105
## Specificity : 0.89127
## Pos Pred Value : 0.26531
## Neg Pred Value : 0.82407
## Prevalence : 0.18669
## Detection Rate : 0.03193
## Detection Prevalence : 0.12036
## Balanced Accuracy : 0.53116
##
## 'Positive' Class : delayed
##
confusionMatrix(predictions_rf_bothsampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_rf_bothsampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction delayed on-time
## delayed 242 725
## on-time 518 2586
##
## Accuracy : 0.6947
## 95% CI : (0.6803, 0.7088)
## No Information Rate : 0.8133
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.09
##
## Mcnemar's Test P-Value : 5.129e-09
##
## Sensitivity : 0.31842
## Specificity : 0.78103
## Pos Pred Value : 0.25026
## Neg Pred Value : 0.83312
## Prevalence : 0.18669
## Detection Rate : 0.05944
## Detection Prevalence : 0.23753
## Balanced Accuracy : 0.54973
##
## 'Positive' Class : delayed
##
results <- resamples(list(under=fit.rf_undersampling, over=fit.rf_oversampling, both=fit.rf_bothsampling))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: under, over, both
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## under 0.5197368 0.5431743 0.5542763 0.5527817 0.5676138 0.5773026 0
## over 0.9233384 0.9283692 0.9305265 0.9317041 0.9367686 0.9384906 0
## both 0.8919583 0.8955774 0.8989867 0.8997853 0.9046811 0.9097052 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## under 0.03947368 0.08634868 0.1085526 0.1055776 0.1352482 0.1546053 0
## over 0.84667674 0.85674166 0.8610554 0.8634082 0.8735329 0.8769811 0
## both 0.78398025 0.79120678 0.7980411 0.7996424 0.8094338 0.8194768 0
set.seed(1)
fit.cart_undersampling <- train(DelayedArr~., data=undersampling, method="rpart", metric=metric, trControl=control)
set.seed(1)
fit.cart_oversampling <- train(DelayedArr~., data=oversampling, method="rpart", metric=metric, trControl=control)
set.seed(1)
fit.cart_bothsampling <- train(DelayedArr~., data=bothsampling, method="rpart", metric=metric, trControl=control)
predictions_cart_undersampling <- predict(fit.cart_undersampling, validation)
predictions_cart_oversampling <- predict(fit.cart_oversampling, validation)
predictions_cart_bothsampling <- predict(fit.cart_bothsampling, validation)
confusionMatrix(predictions_cart_undersampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_cart_undersampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction delayed on-time
## delayed 440 1579
## on-time 320 1732
##
## Accuracy : 0.5335
## 95% CI : (0.5181, 0.5489)
## No Information Rate : 0.8133
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0623
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5789
## Specificity : 0.5231
## Pos Pred Value : 0.2179
## Neg Pred Value : 0.8441
## Prevalence : 0.1867
## Detection Rate : 0.1081
## Detection Prevalence : 0.4959
## Balanced Accuracy : 0.5510
##
## 'Positive' Class : delayed
##
confusionMatrix(predictions_cart_oversampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_cart_oversampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction delayed on-time
## delayed 507 1782
## on-time 253 1529
##
## Accuracy : 0.5001
## 95% CI : (0.4846, 0.5156)
## No Information Rate : 0.8133
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0726
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6671
## Specificity : 0.4618
## Pos Pred Value : 0.2215
## Neg Pred Value : 0.8580
## Prevalence : 0.1867
## Detection Rate : 0.1245
## Detection Prevalence : 0.5623
## Balanced Accuracy : 0.5644
##
## 'Positive' Class : delayed
##
confusionMatrix(predictions_cart_bothsampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_cart_bothsampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction delayed on-time
## delayed 469 1758
## on-time 291 1553
##
## Accuracy : 0.4967
## 95% CI : (0.4812, 0.5122)
## No Information Rate : 0.8133
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0494
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6171
## Specificity : 0.4690
## Pos Pred Value : 0.2106
## Neg Pred Value : 0.8422
## Prevalence : 0.1867
## Detection Rate : 0.1152
## Detection Prevalence : 0.5470
## Balanced Accuracy : 0.5431
##
## 'Positive' Class : delayed
##
results <- resamples(list(under=fit.cart_undersampling, over=fit.cart_oversampling, both=fit.cart_bothsampling))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: under, over, both
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## under 0.5328947 0.5439967 0.5603961 0.5552424 0.5675557 0.5690789 0
## over 0.5677614 0.5733838 0.5755804 0.5765629 0.5810683 0.5858815 0
## both 0.5307125 0.5419226 0.5551129 0.5522230 0.5604665 0.5690608 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## under 0.06578947 0.08799342 0.1209316 0.1104634 0.1347427 0.1381579 0
## over 0.13558603 0.14670775 0.1511140 0.1531257 0.1621915 0.1718376 0
## both 0.06120970 0.08352119 0.1107704 0.1050697 0.1221803 0.1401949 0
set.seed(1)
fit.knn_undersampling <- train(DelayedArr~., data=undersampling, method="knn", metric=metric, trControl=control)
set.seed(1)
fit.knn_oversampling <- train(DelayedArr~., data=oversampling, method="knn", metric=metric, trControl=control)
set.seed(1)
fit.knn_bothsampling <- train(DelayedArr~., data=bothsampling, method="knn", metric=metric, trControl=control)
predictions_knn_undersampling <- predict(fit.knn_undersampling, validation)
predictions_knn_oversampling <- predict(fit.knn_oversampling, validation)
predictions_knn_bothsampling <- predict(fit.knn_bothsampling, validation)
confusionMatrix(predictions_knn_undersampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_knn_undersampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction delayed on-time
## delayed 397 1619
## on-time 363 1692
##
## Accuracy : 0.5131
## 95% CI : (0.4977, 0.5286)
## No Information Rate : 0.8133
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0204
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.52237
## Specificity : 0.51102
## Pos Pred Value : 0.19692
## Neg Pred Value : 0.82336
## Prevalence : 0.18669
## Detection Rate : 0.09752
## Detection Prevalence : 0.49521
## Balanced Accuracy : 0.51670
##
## 'Positive' Class : delayed
##
confusionMatrix(predictions_knn_oversampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_knn_oversampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction delayed on-time
## delayed 404 1654
## on-time 356 1657
##
## Accuracy : 0.5063
## 95% CI : (0.4908, 0.5217)
## No Information Rate : 0.8133
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0193
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.53158
## Specificity : 0.50045
## Pos Pred Value : 0.19631
## Neg Pred Value : 0.82315
## Prevalence : 0.18669
## Detection Rate : 0.09924
## Detection Prevalence : 0.50553
## Balanced Accuracy : 0.51602
##
## 'Positive' Class : delayed
##
confusionMatrix(predictions_knn_bothsampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_knn_bothsampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction delayed on-time
## delayed 405 1556
## on-time 355 1755
##
## Accuracy : 0.5306
## 95% CI : (0.5151, 0.546)
## No Information Rate : 0.8133
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0391
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.53289
## Specificity : 0.53005
## Pos Pred Value : 0.20653
## Neg Pred Value : 0.83175
## Prevalence : 0.18669
## Detection Rate : 0.09948
## Detection Prevalence : 0.48170
## Balanced Accuracy : 0.53147
##
## 'Positive' Class : delayed
##
results <- resamples(list(under=fit.knn_undersampling, over=fit.knn_oversampling, both=fit.knn_bothsampling))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: under, over, both
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## under 0.4851974 0.5102796 0.5160220 0.5152923 0.5246711 0.5312500 0
## over 0.6752266 0.6806342 0.6849157 0.6854415 0.6884968 0.7006418 0
## both 0.6277641 0.6513198 0.6562308 0.6561267 0.6647727 0.6793612 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## under -0.02960526 0.02055921 0.03204792 0.03058133 0.04934211 0.0625000 0
## over 0.35045317 0.36136788 0.36978608 0.37088544 0.37692435 0.4012027 0
## both 0.25648767 0.30351370 0.31327833 0.31305003 0.33037773 0.3593412 0
results_total <- resamples(list(under_glm=fit.glm_undersampling, over_glm=fit.glm_oversampling, both_glm=fit.glm_bothsampling,
under_rf=fit.rf_undersampling, over_rf=fit.rf_oversampling, both_rf=fit.rf_bothsampling,
under_cart=fit.cart_undersampling, over_cart=fit.cart_oversampling, both_cart=fit.cart_bothsampling,
under_knn=fit.knn_undersampling, over_knn=fit.knn_oversampling, both_knn=fit.knn_bothsampling))
summary(results_total)
##
## Call:
## summary.resamples(object = results_total)
##
## Models: under_glm, over_glm, both_glm, under_rf, over_rf, both_rf, under_cart, over_cart, both_cart, under_knn, over_knn, both_knn
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## under_glm 0.5180921 0.5242599 0.5382124 0.5391318 0.5449690 0.5789474 0
## over_glm 0.5311438 0.5405814 0.5502927 0.5475695 0.5545695 0.5590789 0
## both_glm 0.5224064 0.5264128 0.5296283 0.5357062 0.5475057 0.5518723 0
## under_rf 0.5197368 0.5431743 0.5542763 0.5527817 0.5676138 0.5773026 0
## over_rf 0.9233384 0.9283692 0.9305265 0.9317041 0.9367686 0.9384906 0
## both_rf 0.8919583 0.8955774 0.8989867 0.8997853 0.9046811 0.9097052 0
## under_cart 0.5328947 0.5439967 0.5603961 0.5552424 0.5675557 0.5690789 0
## over_cart 0.5677614 0.5733838 0.5755804 0.5765629 0.5810683 0.5858815 0
## both_cart 0.5307125 0.5419226 0.5551129 0.5522230 0.5604665 0.5690608 0
## under_knn 0.4851974 0.5102796 0.5160220 0.5152923 0.5246711 0.5312500 0
## over_knn 0.6752266 0.6806342 0.6849157 0.6854415 0.6884968 0.7006418 0
## both_knn 0.6277641 0.6513198 0.6562308 0.6561267 0.6647727 0.6793612 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## under_glm 0.03618421 0.04851974 0.07644376 0.07827149 0.08996921 0.1578947
## over_glm 0.06226774 0.08116872 0.10057775 0.09513666 0.10913897 0.1181582
## both_glm 0.04466697 0.05264292 0.05907156 0.07131276 0.09503272 0.1035835
## under_rf 0.03947368 0.08634868 0.10855263 0.10557763 0.13524822 0.1546053
## over_rf 0.84667674 0.85674166 0.86105538 0.86340824 0.87353293 0.8769811
## both_rf 0.78398025 0.79120678 0.79804109 0.79964241 0.80943378 0.8194768
## under_cart 0.06578947 0.08799342 0.12093158 0.11046342 0.13474274 0.1381579
## over_cart 0.13558603 0.14670775 0.15111395 0.15312575 0.16219145 0.1718376
## both_cart 0.06120970 0.08352119 0.11077038 0.10506973 0.12218029 0.1401949
## under_knn -0.02960526 0.02055921 0.03204792 0.03058133 0.04934211 0.0625000
## over_knn 0.35045317 0.36136788 0.36978608 0.37088544 0.37692435 0.4012027
## both_knn 0.25648767 0.30351370 0.31327833 0.31305003 0.33037773 0.3593412
## NA's
## under_glm 0
## over_glm 0
## both_glm 0
## under_rf 0
## over_rf 0
## both_rf 0
## under_cart 0
## over_cart 0
## both_cart 0
## under_knn 0
## over_knn 0
## both_knn 0
fit.rf_undersampling
## Random Forest
##
## 6082 samples
## 5 predictor
## 2 classes: 'on-time', 'delayed'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 5474, 5474, 5474, 5473, 5474, 5474, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.5498204 0.09964762
## 3 0.5513017 0.10261402
## 5 0.5527817 0.10557763
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 5.
It shows that mtry of 5 has the highest accuracy for fit.rf_undersampling
Grid Search
control_grid <- trainControl(method="cv", number=10 , search="grid")
set.seed(1)
fit.rf_undersampling_grid <- train(DelayedArr~., data=undersampling, method="rf", metric=metric, trControl=control_grid)
print(fit.rf_undersampling_grid)
## Random Forest
##
## 6082 samples
## 5 predictor
## 2 classes: 'on-time', 'delayed'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 5474, 5474, 5474, 5473, 5474, 5474, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.5498204 0.09964762
## 3 0.5513017 0.10261402
## 5 0.5527817 0.10557763
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 5.
plot(fit.rf_undersampling_grid)
Grid-search takes quite long time to load, hence random search is often used instead. Although it is done within certain amount of time, it has systematic way to find the appropriate model instead of just putting random values several times. Moreover, it is also called coarse search because it proceeds two to three times and gradually narrows down.
RandomSearch
control_random <- trainControl(method="cv", number=10, search="random")
set.seed(1)
fit.rf_undersampling_random <- train(DelayedArr~., data=undersampling, method="rf", metric=metric, trControl=control_random)
print(fit.rf_undersampling_random)
## Random Forest
##
## 6082 samples
## 5 predictor
## 2 classes: 'on-time', 'delayed'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 5474, 5474, 5474, 5473, 5474, 5474, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 1 0.5753079 0.15061929
## 2 0.5555724 0.11114867
## 5 0.5494928 0.09899791
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 1.
plot(fit.rf_undersampling_random)
rf_permutation <- ranger(DelayedArr~ ., dataset, importance = 'permutation')
rf_permutation_importance <- data.frame(
var = names(rf_permutation$variable.importance), imp = c(rf_permutation$variable.importance))
rf_permutation_importance$data_type <- 'permutation'
ggplot(rf_permutation_importance) +
geom_bar(aes(x=reorder(var, imp), y=imp), stat = 'identity',fill="steelblue") +
labs(title = "Importance of variables",x = "Variables", y ="Importance") +
theme(plot.title = element_text(hjust = 0.5))